/****************************************************************************************************
Do file name: 3_MergeStrings.do
Author: Thiago Scot
This version: April 16th, 2019
Description: Performs a string match between the judges' exam data and the CNJ performance data. Takes a long time to run.
Output is a "merge dictionary" containing the names and identifiers that connect the two databases.
****************************************************************************************************/


* ---------------------------------------------------------------------------
* 1. Master side: judges' exam data
* ---------------------------------------------------------------------------
use "$temp/judges_merged_concurso.dta", clear

rename fullname Judge
drop _merge

* Keep only state judges for whom we have data on productivity.
drop if Estado == "Federal" // (171 observations deleted)

* Shared cleaning script; it is also run on the CNJ side below so that both
* datasets go through the same processing before being compared.
do "$do/3b_Judge_clean.do"

* ---------------------------------------------------------------------------
* 2. Using side: CNJ performance data, one row per judge name
* ---------------------------------------------------------------------------
* preserve/restore keeps the exam data in memory untouched while the CNJ file
* is prepared and written to a tempfile.
preserve
	use "$temp/CnjDataset_clean.dta", clear
	gduplicates drop Judge, force
	do "$do/3b_Judge_clean.do"
	tempfile temp
	save "`temp'"
restore

* ---------------------------------------------------------------------------
* 3. Fuzzy string match on judge name
* ---------------------------------------------------------------------------
/* IMPORTANT: only matches passing the 0.9 similarity threshold, t(0.9), are kept.
At this level we should have no false positives, but we are certainly dropping
some potential matches with a lower similarity score. */

matchit Judge_ID Judge using "`temp'" , idusing(Judge_idP) txtusing(Judge) override t(0.9)

* ---------------------------------------------------------------------------
* 4. Keep the single best candidate per judge
* ---------------------------------------------------------------------------
* Rank candidates within each judge name from highest to lowest similarity.
* Judge_idP is added as a tie-breaker: gsort leaves the relative order of tied
* observations unspecified, so without it a judge with two equally-scored
* candidates could keep a different match from one run to the next.
gsort Judge -similscore Judge_idP
by Judge: gen rank = _n

tab rank
/* The vast majority of matched judges are uniquely matched to a name. Keep the best match.
       rank |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |      2,591       94.94       94.94
          2 |        125        4.58       99.52
          3 |         10        0.37       99.89
          4 |          2        0.07       99.96
          5 |          1        0.04      100.00
------------+-----------------------------------
      Total |      2,729      100.00

*/
keep if rank == 1											//Keep highest similarity score by judge

* Constant flag set to 3 on every retained row of the merge dictionary.
gen _merge_match = 3

save "$temp/judges_matchit_new.dta", replace
